001 /* 002 * CharSequence.java 003 * 004 * Copyright 2003 Sergio Anibal de Carvalho Junior 005 * 006 * This file is part of NeoBio. 007 * 008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of 009 * the GNU General Public License as published by the Free Software Foundation; either 010 * version 2 of the License, or (at your option) any later version. 011 * 012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR 014 * PURPOSE. See the GNU General Public License for more details. 015 * 016 * You should have received a copy of the GNU General Public License along with NeoBio; 017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 018 * Boston, MA 02111-1307, USA. 019 * 020 * Proper attribution of the author as the source of the software would be appreciated. 021 * 022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net 023 * Department of Computer Science http://www.dcs.kcl.ac.uk 024 * King's College London, UK http://www.kcl.ac.uk 025 * 026 * Please visit http://neobio.sourceforge.net 027 * 028 * This project was supervised by Professor Maxime Crochemore. 029 * 030 */ 031 032 package neobio.alignment; 033 034 import java.io.Reader; 035 import java.io.BufferedReader; 036 import java.io.IOException; 037 038 /** 039 * This class implements a sequence of characters stored as an array that provides random 040 * access to any position in constant time. 041 * 042 * <P>The input can come from any source, provided it is encapsulated in a proper 043 * <CODE>Reader</CODE> instance. The stream is expected to be ready (i.e. the next 044 * <CODE>read</CODE> operation must return the first character of the sequence) and it is 045 * not closed when its end is reached, so the client is allowed to reset it and maybe use 046 * it for another purpose.</P> 047 * 048 * <P>Sequences can contain letters only although lines started with the 049 * <CODE>COMMENT_CHAR</CODE> character ('>') are regarded as comments and are completely 050 * skipped. White spaces (including tabs, line feeds and carriage returns) are also 051 * ignored throughout.</P> 052 * 053 * <P>This class is used by two sequence alignment algorithms: {@linkplain SmithWaterman} 054 * and {@linkplain NeedlemanWunsch}.</P> 055 * 056 * @author Sergio A. de Carvalho Jr. 057 * @see SmithWaterman 058 * @see NeedlemanWunsch 059 */ 060 public class CharSequence 061 { 062 /** 063 * The character used to start a comment line in a sequence file. When this character 064 * is found, the rest of the line is ignored. 065 */ 066 protected static final char COMMENT_CHAR = '>'; 067 068 /** 069 * Stores the sequence as an array of characters. 070 */ 071 protected char sequence[]; 072 073 /** 074 * Creates a new instance of a <CODE>CharSequence</CODE>, loading the sequence data 075 * from the <CODE>Reader</CODE> input stream. 076 * 077 * @param reader source of characters for this sequence 078 * @throws IOException if an I/O exception occurs when reading the input 079 * @throws InvalidSequenceException if the input does not contain a valid sequence 080 */ 081 public CharSequence (Reader reader) throws IOException, InvalidSequenceException 082 { 083 int ch; 084 char c; 085 086 BufferedReader input = new BufferedReader(reader); 087 088 StringBuffer buf = new StringBuffer(); 089 090 // read characters 091 while ((ch = input.read()) != -1) 092 { 093 // conver to char 094 c = (char) ch; 095 096 // skip line if comment character is found 097 if (c == COMMENT_CHAR) 098 input.readLine(); 099 100 // accept letters only 101 else if (Character.isLetter(c)) 102 buf.append(c); 103 104 // anything else, except whitespaces, will throw an exception 105 else if (!Character.isWhitespace(c)) 106 throw new InvalidSequenceException 107 ("Sequences can contain letters only."); 108 } 109 110 // check if read anything! 111 if (buf.length() > 0) 112 sequence = new char[buf.length()]; 113 else 114 throw new InvalidSequenceException ("Empty sequence."); 115 116 // copy data to 117 buf.getChars(0, buf.length(), sequence, 0); 118 } 119 120 /** 121 * Returns the number of characters of this sequence. 122 * 123 * @return int number of characters of this sequence 124 */ 125 public int length () 126 { 127 return sequence.length; 128 } 129 130 /** 131 * Returns the character at a given position. For the client, the first character is 132 * at position 1, while the last character is at position <CODE>length()</CODE>. This 133 * is convinient for sequence alignment algorithms based on a classic dynamic 134 * programming matrix since the sequences usually start at row/column 1. This method 135 * does not check boundaries, therefore an <CODE>ArrayIndexOutOfBoundsException</CODE> 136 * may be raised if <CODE>pos</CODE> is out of bounds. 137 * 138 * @param pos position of character (from 1 to <CODE>length()</CODE> inclusive) 139 * @return the character 140 */ 141 public char charAt (int pos) 142 { 143 // convert from one-based to zero-based index 144 return sequence[pos-1]; 145 } 146 147 /** 148 * Returns a string representation of the sequence. 149 * 150 * @return a string representation of the sequence 151 */ 152 public String toString () 153 { 154 return new String(sequence); 155 } 156 }